# !diagnostics off
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(ClusterR)
library(DESeq2)
library(expss)
library(knitr) ; library(kableExtra)
Load preprocessed dataset (preprocessing code in 01_data_preprocessing.Rmd)
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
DE_info = DE_info %>% data.frame
datMeta = datMeta %>% mutate(ID = title)
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('entrezgene'=rownames(.) %>% as.numeric) %>%
dplyr::rename('padj' = adj.P.Val, 'log2FoldChange' = logFC) %>%
left_join(datGenes %>% dplyr::select(entrezgene, ensembl_gene_id) %>%
dplyr::rename('ID' = ensembl_gene_id), by = 'entrezgene') %>%
left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj)) %>%
mutate(Group = factor(ifelse(gene.score %in% c('Neuronal','Others'), gene.score, 'SFARI'),
levels = c('SFARI', 'Neuronal', 'Others')))
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
rm(GO_annotations)
There are 912 genes with a SFARI score, but to map them to the gene expression dataset we had to map the gene names to their corresponding ensembl IDs
There are 1116 Ensembl IDs corresponding to the 912 genes in the SFARI Gene dataset
Since a gene can have more than one ensembl ID, there were some one-to-many mappings between a gene name and ensembl IDs, so that’s why we ended up with 1116 rows in the SFARI_genes dataset.
The details about how the genes were annotated with their Ensembl IDs can be found in SecondYear/SFARI/RMarkdowns/get_ensembl_ids_new_SFARI.html
There are 87 genes in the SFARI list without a score, of which 0 don’t have syndromic tag either
There are 782 SFARI Genes in the expression dataset (~70%)
Of these, only 713 have an assigned score
From now on, we’re only going to focus on these 713 genes with a score
Gene count by SFARI score:
table_info = genes_info %>% apply_labels(`gene-score` = 'SFARI Gene Score', syndromic = 'Syndromic Tag',
Neuronal = 'Neuronal Function', gene.score = 'Gene Score') %>%
mutate(syndromic = as.logical(syndromic), Neuronal = as.logical(Neuronal))
cro(table_info$`gene-score`)
|  #Total | |
|---|---|
|  SFARI Gene Score | |
| Â Â Â 1Â | 114 |
| Â Â Â 2Â | 188 |
| Â Â Â 3Â | 411 |
|    Others | 14679 |
|    #Total cases | 15392 |
Gene count by Syndromic tag:
cro(table_info$syndromic)
|  #Total | |
|---|---|
|  Syndromic Tag | |
| Â Â Â FALSEÂ | 677 |
| Â Â Â TRUEÂ | 105 |
|    #Total cases | 782 |
GO Neuronal annotations:
1042 genes have neuronal-related annotations
149 of these genes have a SFARI score
cro(table_info$gene.score[genes_info$`gene-score` %in% as.character(c(1:3))],
list(table_info$Neuronal[genes_info$`gene-score` %in% as.character(c(1:3))], total()))
|  Neuronal Function |  #Total | |||
|---|---|---|---|---|
| Â FALSEÂ | Â TRUEÂ | Â | ||
|  Gene Score | ||||
| Â Â Â 1Â | 80 | 34 | Â | 114 |
| Â Â Â 2Â | 144 | 44 | Â | 188 |
| Â Â Â 3Â | 340 | 71 | Â | 411 |
|    #Total cases | 564 | 149 |  | 713 |
rm(table_info)
Significantly Larger mean expression and smaller SD than the other two groups
Note: SD plot was truncated so not all outliers are included
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(genes_info, by='ID') %>%
mutate(Group = factor(ifelse(gene.score %in% c('Neuronal','Others'), gene.score, 'SFARI'),
levels = c('SFARI', 'Neuronal', 'Others')))
comparisons = list(c('SFARI','Neuronal'), c('Neuronal','Others'), c('SFARI','Others'))
increase = 1.5
base = 11
pos_y_comparisons = c(1:3*increase + base)
p1 = plot_data %>% ggplot(aes(Group, MeanExpr, fill=Group)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, tip.length = .02) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
xlab('') + ylab('Mean Expression') + ggtitle('Mean Expression Comparison') +
theme_minimal() + theme(legend.position='none')
increase = 0.07
base = 0.78
pos_y_comparisons = c(1:3*increase + base)
p2 = plot_data %>% ggplot(aes(Group, SDExpr, fill=Group)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, tip.length = .01) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
coord_cartesian(ylim= c(0.05, max(pos_y_comparisons))) +
xlab('') + ylab('Standard Deviation') + ggtitle('Standard Deviation Comparison') +
theme_minimal() + theme(legend.position='none')
grid.arrange(p1, p2, nrow=1)
rm(p1, p2, increase, base, pos_y_comparisons)
Proportion of over- and under-expressed genes is very similar between groups: approximately half
genes_info %>% mutate(direction = ifelse(log2FoldChange>0, 'over-expressed', 'under-expressed')) %>%
group_by(Group, direction) %>% tally(name = 'over_expressed') %>%
filter(direction == 'over-expressed') %>% ungroup %>%
left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
mutate('prop_over_expressed' = round(over_expressed/Total,3)) %>%
dplyr::select(-direction) %>% kable %>% kable_styling(full_width = F)
| Group | over_expressed | Total | prop_over_expressed |
|---|---|---|---|
| SFARI | 336 | 713 | 0.471 |
| Neuronal | 355 | 893 | 0.398 |
| Others | 6392 | 13786 | 0.464 |
Significantly lower LFC Magnitude than the rest of the genes
increase = 0.02
base = 0.39
pos_y_comparisons = c(1:3*increase + base)
plot_data %>% ggplot(aes(Group, abs(log2FoldChange), fill=Group)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons,
tip.length = .002) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
coord_cartesian(ylim= c(0.05, max(pos_y_comparisons))) +
xlab('') + ylab('LFC Magnitude') + ggtitle('LFC Magnitude Comparison') +
theme_minimal() + theme(legend.position='none')
rm(increase, base, pos_y_comparisons)
SFARI Genes, as a group, have less genes with high positive LFC than the rest of the genes in the dataset
A similar thing seems to be happening with high negative LFC, but not as strongly as the positive ones
plot_data = genes_info %>% dplyr::select(Group, log2FoldChange) %>%
mutate(quant = cut(log2FoldChange, breaks = quantile(log2FoldChange, probs = seq(0,1,0.05)) %>%
as.vector, labels = FALSE),
value_range = cut(log2FoldChange, breaks = quantile(log2FoldChange, probs=seq(0,1,0.05)) %>%
as.vector)) %>%
filter(Group == 'SFARI') %>% group_by(quant, value_range) %>% tally %>% ungroup %>%
left_join(genes_info %>% dplyr::select(Group, log2FoldChange) %>%
mutate(quant = cut(log2FoldChange, breaks = quantile(log2FoldChange,
probs = seq(0,1,0.05)) %>% as.vector, labels = FALSE)) %>%
group_by(quant) %>% tally(name = 'tot') %>% ungroup) %>% mutate(p = 100*n/tot)
ggplotly(plot_data %>% ggplot(aes(quant, p)) + geom_smooth(color = 'gray', alpha = 0.1) +
geom_bar(stat = 'identity', fill = '#00A4F7', aes(id = value_range)) +
geom_hline(yintercept = 100*mean(genes_info$Group == 'SFARI'), color = 'gray', linetype = 'dotted') +
xlab('Log Fold Change Quantiles') + ylab('% of SFARI Genes in each Quantile') + ggtitle('
Distribution of SFARI Genes in LFC Quantiles') + theme_minimal())
data.frame('Quantile' = 1:20, 'LFC Range' = cut(genes_info$log2FoldChange,
breaks = quantile(genes_info$log2FoldChange, probs=seq(0,1,.05)) %>% as.vector) %>% table %>% names) %>%
kable(caption = 'LFC ranges for each quantile') %>% kable_styling(full_width = F)
| Quantile | LFC.Range |
|---|---|
| 1 | (-2.86,-0.355] |
| 2 | (-0.355,-0.224] |
| 3 | (-0.224,-0.165] |
| 4 | (-0.165,-0.128] |
| 5 | (-0.128,-0.101] |
| 6 | (-0.101,-0.0784] |
| 7 | (-0.0784,-0.0602] |
| 8 | (-0.0602,-0.043] |
| 9 | (-0.043,-0.027] |
| 10 | (-0.027,-0.0118] |
| 11 | (-0.0118,0.0037] |
| 12 | (0.0037,0.0199] |
| 13 | (0.0199,0.0366] |
| 14 | (0.0366,0.0545] |
| 15 | (0.0545,0.075] |
| 16 | (0.075,0.1] |
| 17 | (0.1,0.133] |
| 18 | (0.133,0.185] |
| 19 | (0.185,0.291] |
| 20 | (0.291,2.44] |
With only two differentially expressed genes, it doesn’t make sense to do this analysis
The higher the SFARI score, the higher the mean expression of the gene: This pattern is quite strong and it doesn’t have any biological interpretation, so it’s probably bias in the SFARI score assignment
The higher the SFARI score, the lower the standard deviation: This pattern is not as strong, but it is weird because the data was originally heteroscedastic with a positive relation between mean and variance, so the fact that the relation now seems to have reversed could mean that the vst normalisation ended up affecting the highly expressed genes more than it should have when trying to correct their higher variance
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(genes_info, by='ID')
comparisons = list(c('1','2'), c('2','3'), c('3','Neuronal'), c('Neuronal','Others'),
c('1','3'), c('3','Others'), c('2','Neuronal'),
c('1','Neuronal'), c('2','Others'), c('1','Others'))
increase = 1.2
base = 12
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)
p1 = plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, tip.length = .02) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
xlab('SFARI Gene Scores') + ylab('Mean Expression') +
theme_minimal() + theme(legend.position='none')
increase = 0.06
base = 0.85
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)
p2 = plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, tip.length = .005) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
coord_cartesian(ylim= c(0.05, max(pos_y_comparisons))) +
xlab('SFARI Gene Scores') + ylab('Standard Deviation') +
theme_minimal() + theme(legend.position='none')
grid.arrange(p1, p2, nrow=1)
rm(p1, p2, base, increase, pos_y_comparisons)
Just to corroborate that the relation between sd and SFARI score used to be in the opposite direction before the preprocessing I made: The higher the SFARI score the higher the mean expression and the lower the standard deviation
# Save preprocessed results
datExpr_prep = datExpr
datMeta_prep = datMeta
genes_info_prep = genes_info
load('./../Data/filtered_raw_data.RData')
plot_data = data.frame('entrezgene'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr),
'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(datGenes %>% dplyr::select(entrezgene, ensembl_gene_id) %>%
mutate(entrezgene = entrezgene %>% as.factor) %>%
dplyr::rename('ID' = ensembl_gene_id), by = 'entrezgene') %>%
left_join(genes_info, by='ID') %>% mutate(gene.score = gene.score %>% as.character %>% as.factor)
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
plotly::subplot(p1, p2, nrows=1)
#Return to normalised version of the data
datExpr = datExpr_prep
datMeta = datMeta_prep
genes_info = genes_info_prep
rm(plot_data, p1, p2, datExpr_prep, datMeta_prep, genes_info_prep)
The proportion of over- and under-expressed genes in each SFARI Gene score is not very different to the proportion in the genes iwth Neuronal annotations nor in the rest of the genes (good, something less to worry about)
aux = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID') %>%
dplyr::mutate(direction = ifelse(log2FoldChange>0, 'over-expressed', 'under-expressed'))
plot_data = aux %>% group_by(gene.score, direction) %>% tally(name = 'p') %>%
left_join(aux %>% group_by(gene.score) %>% tally, by = 'gene.score') %>% mutate(p = p/n, y=1)
plot_data %>% ggplot(aes(gene.score, p, fill=direction)) + geom_bar(stat='identity') +
geom_hline(yintercept = mean(plot_data$p[plot_data$direction=='under-expressed']),
linetype = 'dashed', color = 'white') +
ylab('Proportion') + xlab('SFARI Gene Scores') +
ggtitle('Direction of Fold-Change in genes by SFARI Score') + theme_minimal()
rm(aux)
The higher the SFARI Gene score, the lower the LFC Magnitude of the genes, the difference is barely statistically significant when comparing the scores with their neighbours, but it becomes stronger when comparing them with the rest of the groups
Note: For clarity, the plot was truncated removing some outlier values
increase = 0.04
base = 0.41
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)
genes_info %>% ggplot(aes(gene.score, abs(log2FoldChange), fill=gene.score)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, tip.length = .003) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
coord_cartesian(ylim = c(0, max(pos_y_comparisons))) +
xlab('SFARI Gene Scores') + ylab('LFC Magnitude') +
theme_minimal() + theme(legend.position='none')
rm(increase, base, pos_y_comparisons)
We know that in general there is a negative relation between mean expression and LFC in genes, and we also know that there is a strong relation between SFARI Gene Scores and the mean level of expression of the genes
This could explain the behaviour we found above, but we want to see if, once you control for the level of expression, the SFARI genes continue to have this relation to LFC or if it dissapears. (Being optimistic, perhaps the SFARI genes actually have higher LFC than genes with similar levels of expression, but we can’t see this in the original plot because of the relation between level of expression and LFC)
plot_data = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score, significant) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)),
by = 'ID') %>%
mutate(alpha = ifelse(gene.score == 'Others' , 0.1, ifelse(gene.score == 'Neuronal', 0.3, 0.7)))
increase = 1.8
base = 12.2
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)
p1 = plot_data %>% ggplot(aes(gene.score, meanExpr, fill=gene.score)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, tip.length = .02) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
xlab('SFARI Gene Scores') + ylab('Mean Expression') +
theme_minimal() + theme(legend.position='none')
p2 = plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color = gene.score)) +
geom_point(alpha=plot_data$alpha) + geom_smooth(method='lm', color='#999999') +
ylab('LogFoldChange Magnitude') + xlab('Mean Expression') +
scale_color_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
theme_minimal() + theme(legend.position = 'none')
p2 = ggExtra::ggMarginal(p2, type = 'density', groupColour = TRUE, size = 10)
grid.arrange(p2, p1, ncol=2, widths = c(0.6, 0.4))
rm(p1, p2)
This plot shows that our above hypothesis was correct but perhaps doesn’t play a role as relevant as we had thought: In general, the higher the level of expression, the lower the LFC Magnitude, but this pattern flattens for the genes with the highest levels of expression of the whole dataset, which is the region where the SFARI Genes are
plot_data = data.frame('meanExpr' = rowMeans(datExpr), 'LFC_magnitude' = abs(genes_info$log2FoldChange),
'gene.score' = genes_info$gene.score, 'p' = NA) %>% arrange(meanExpr)
w = 1000
for(i in 1:(nrow(plot_data)-w)){
plot_data$p[i+floor(w/2)] = mean(plot_data$LFC_magnitude[i:(i+w)])
}
aux_data = plot_data %>% filter(!gene.score %in% c('Neuronal','Others')) %>% group_by(gene.score) %>%
dplyr::summarise(mean_by_score = mean(meanExpr)) %>% ungroup %>%
mutate('color' = SFARI_colour_hue(r=1:6)[1:3])
ggplotly(plot_data %>% filter(!is.na(p)) %>% ggplot(aes(meanExpr, p)) + geom_line() +
xlab('Mean Level of Expression') + ylab('Sliding Average of LFC Magnitude') +
geom_vline(data = aux_data, aes(xintercept = mean_by_score), color = aux_data$color) +
ggtitle('Sliding Average of LFC Magnitude by Mean Level of Expression') + theme_minimal())
rm(aux_data)
We want to know what happens to the originally negative relation we found between SFARI Gene scores and lFC magnitude when we control for level of expression.
To do this, I’m going to compare each SFARI Gene with its closest non-SFARI neighbours following these steps:
Select one SFARI gene
Select its neighbours: 100 non-SFARI genes with the most similar mean level of Expression
Standardise the lFC magnitude of each of the neighbours and of the SFARI gene (using the mean and sd of the lFC magnitude of only these 101 genes)
Repeat this for each of the SFARI Genes, saving the standardised lFC magnitudes of all the SFARI genes and all the neighbours
Compare the distribution of this value between these two groups (SFARI and their neighbours)
This plot shows the general idea of steps 1, 2, and 3, selecting a random SFARI gene:
The plot on the left shows the original mean expression and lFC magnitude of the SFARI Gene and its 100 closest neighbours
The plot on the right shows the standardised lFC mangitude of the genes, and the vertical lines represent the value that is going to be recorded for each of this genes to be compared afterwards
n = 100
plot_data = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)),
by = 'ID')
SFARI_gene = plot_data %>% filter(gene.score %in% c('1','2','3','4','5','6')) %>% sample_n(1) %>%
mutate(d=0, alpha = 1)
nn = plot_data %>% filter(gene.score %in% c('Neuronal','Others')) %>%
mutate(d = abs(meanExpr-SFARI_gene$meanExpr), alpha=0.5) %>% top_n(n=-n, wt = d)
plot_data = rbind(SFARI_gene, nn) %>%
mutate(std_magnitude = (abs(log2FoldChange) - mean(abs(log2FoldChange)))/sd(abs(log2FoldChange)))
p1 = plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color = gene.score)) +
geom_point(alpha = plot_data$alpha) + xlab('Mean Expression') + ylab('Log2 Fold Change Magnitude') +
scale_color_manual(values=SFARI_colour_hue(r=c(as.numeric(SFARI_gene$gene.score),8,7))) +
theme_minimal() + theme(legend.position='none')
p2 = plot_data %>% ggplot(aes(meanExpr, std_magnitude, color = gene.score)) +
geom_point(alpha = plot_data$alpha) +
geom_hline(aes(yintercept = mean(std_magnitude)), linetype = 'dashed', color = '#999999') +
scale_color_manual(values=SFARI_colour_hue(r=c(as.numeric(SFARI_gene$gene.score),8,7))) +
geom_segment(aes(x=SFARI_gene$meanExpr, y=mean(std_magnitude), xend = SFARI_gene$meanExpr,
yend = std_magnitude[1]), alpha = 0.5,
color = SFARI_colour_hue(r=1:8)[as.numeric(SFARI_gene$gene.score)]) +
xlab('Mean Expression') + ylab('Standardised LFC Magnitude') +
theme_minimal() + theme(legend.position='none')
for(i in 1:15){
random_sample = plot_data %>% filter(gene.score != SFARI_gene$gene.score) %>% sample_n(1)
p2 = p2 + geom_segment(x=random_sample$meanExpr, xend = random_sample$meanExpr, y=mean(plot_data$std_magnitude),
yend = random_sample$std_magnitude, alpha = 0.5, color = 'gray')
}
grid.arrange(p1, p2, ncol=2, top='Comparing SFARI Genes with their n closest neighbours by Mean Expression')
cat(paste0('SFARI gene\'s standardised distance to its neigbours\'s LFC magnitude: ',
round(plot_data$std_magnitude[1],4)))
## SFARI gene's standardised distance to its neigbours's LFC magnitude: -0.9666
rm(p1, p2, SFARI_gene, nn, random_sample, i)
As steps 4, and 5, say, we repeat this for all of the SFARI Genes, recording their standardised mangnitude as well as the ones from their neighbours so we can study them all together
Even when controlling for the relation between Mean Expression and LFC by comparing each SFARI Gene only with neighbouring genes, we see similar results, just not as strong as before
Neuronal genes have consistently higher magnitudes of LFC than non-SFARI, non-neuronal genes (makes sense)
SFARI Genes have similar LFC Magnitude than genes without Neuronal annotations and lower than genes with neuronal annotations
get_std_lfc_magnitudes = function(data_info, SFARI_score, n){
SFARI_genes = data_info %>% filter(gene.score == as.character(SFARI_score))
std_magnitudes = data.frame(gene.score = as.character(), std_magnitude = as.numeric)
for(i in 1:nrow(SFARI_genes)){
SFARI_gene = SFARI_genes[i,]
nn = data_info %>% filter(gene.score %in% c('Neuronal','Others')) %>%
mutate(d = abs(meanExpr-SFARI_gene$meanExpr)) %>% top_n(n=-n, wt = d) %>% dplyr::select(-d)
iter_data = rbind(SFARI_gene, nn) %>%
mutate(std_magnitude = (abs(log2FoldChange) - mean(abs(log2FoldChange)))/sd(abs(log2FoldChange))) %>%
dplyr::select(gene.score, std_magnitude)
std_magnitudes = rbind(std_magnitudes, iter_data)
}
return(std_magnitudes)
}
create_plot_by_SFARI_score = function(score, n) {
std_magnitudes = get_std_lfc_magnitudes(data_info, score, n)
plot = std_magnitudes %>% ggplot(aes(gene.score, std_magnitude)) +
geom_boxplot(aes(fill = gene.score), outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
xlab('') + ylab('Standardised LFC Magnitude') +
scale_fill_manual(values=SFARI_colour_hue(r=c(score,8,7))) +
coord_cartesian(ylim = c(min(std_magnitudes$std_magnitude), 3)) +
stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
ref.group = as.character(score), label.y = 3) +
theme_minimal() + theme(legend.position = 'none')
return(plot)
}
data_info = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)),
by = 'ID')
p1 = create_plot_by_SFARI_score(1, n)
p2 = create_plot_by_SFARI_score(2, n)
p3 = create_plot_by_SFARI_score(3, n)
grid.arrange(p1, p2, p3, nrow=1,
top = 'Comparison of LFC Magnitude of SFARI gens and their closest neighbours by Mean Expression')
rm(p1, p2, p3)
With only 2 differentially expressed genes there’s not much point in doing this
The patterns found in Gandal can also be found in this dataset
There aren’t enough differentially expressed genes to study their patterns
In the case of the Log Fold Change, we do see the same patterns, and they are statistically significant as well
The patterns weren’t as clean or significant in Gupta’s dataset as they were in Gandal and our hypothesis for that was an increased variance in the dataset. To corroborate this I compared this dataset’s variance with Gandal: in general, both datasets share a similar variance except for the genes with the lowest levels of expression, which have an inflated variance in this dataset
Wright_datExpr = datExpr
Wright_datGenes = datGenes
Wright_genes_info = genes_info
load('./../../../Gandal/AllRegions/Data/preprocessed_data.RData')
Gandal_datExpr = datExpr
plot_data = data.frame('ID' = rownames(Gandal_datExpr), 'Gandal_SD' = rowSdDiffs(Gandal_datExpr),
'MeanExpression' = rowMeans(Gandal_datExpr)) %>%
left_join(Wright_datGenes %>% data.frame %>% dplyr::select(entrezgene, ensembl_gene_id) %>%
dplyr::rename('ID'=ensembl_gene_id) %>% mutate(entrezgene = entrezgene %>% as.factor),
by = 'ID') %>%
inner_join(data.frame('ID' = rownames(Wright_datExpr),
'Wright_SD' = rowSdDiffs(Wright_datExpr)),
by = 'ID') %>%
left_join(genes_info %>% data.frame, by = 'ID') %>%
mutate(diff = Gandal_SD-Wright_SD, abs_diff = abs(Gandal_SD-Wright_SD)) %>%
mutate(std_diff = (diff-mean(diff))/sd(diff), distance = abs((diff-mean(diff))/sd(diff)))
plot_data %>% ggplot(aes(Gandal_SD, Wright_SD)) + geom_point(alpha=0.1, aes(color=MeanExpression)) +
geom_abline(slope = 1, intercept = 0, color = 'gray', linetype = 'dashed') +
geom_smooth(alpha = 0.1, color = 'gray') + xlab('Gandal') + ylab('Wright') +
coord_fixed() + scale_x_continuous(limits = c(0, max(plot_data$Wright_SD))) +
scale_colour_viridis(, begin=0.1) +
ggtitle('SD Comparison between Datasets') + theme_minimal()
rm(datExpr, datMeta, datGenes, dds, DE_info, Wright_datExpr, Wright_datGenes, Wright_genes_info,
Gandal_datExpr, plot_data)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] kableExtra_1.1.0 knitr_1.28
## [3] expss_0.10.2 DESeq2_1.24.0
## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [7] BiocParallel_1.18.1 matrixStats_0.56.0
## [9] Biobase_2.44.0 GenomicRanges_1.36.1
## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3
## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0
## [15] ClusterR_1.2.1 gtools_3.8.2
## [17] Rtsne_0.15 ggpubr_0.2.5
## [19] magrittr_1.5 GGally_1.5.0
## [21] gridExtra_2.3 viridis_0.5.1
## [23] viridisLite_0.3.0 RColorBrewer_1.1-2
## [25] plotlyutils_0.0.0.9000 plotly_4.9.2
## [27] glue_1.4.1 reshape2_1.4.4
## [29] forcats_0.5.0 stringr_1.4.0
## [31] dplyr_1.0.0 purrr_0.3.4
## [33] readr_1.3.1 tidyr_1.1.0
## [35] tibble_3.0.1 ggplot2_3.3.2
## [37] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8 Hmisc_4.4-0
## [4] plyr_1.8.6 lazyeval_0.2.2 splines_3.6.3
## [7] gmp_0.5-13.6 crosstalk_1.1.0.1 digest_0.6.25
## [10] htmltools_0.4.0 fansi_0.4.1 checkmate_2.0.0
## [13] memoise_1.1.0 cluster_2.1.0 annotate_1.62.0
## [16] modelr_0.1.6 jpeg_0.1-8.1 colorspace_1.4-1
## [19] blob_1.2.1 rvest_0.3.5 haven_2.2.0
## [22] xfun_0.12 crayon_1.3.4 RCurl_1.98-1.2
## [25] jsonlite_1.7.0 genefilter_1.66.0 survival_3.1-12
## [28] gtable_0.3.0 zlibbioc_1.30.0 XVector_0.24.0
## [31] webshot_0.5.2 scales_1.1.1 DBI_1.1.0
## [34] miniUI_0.1.1.1 Rcpp_1.0.4.6 xtable_1.8-4
## [37] htmlTable_1.13.3 foreign_0.8-76 bit_1.1-15.2
## [40] Formula_1.2-3 htmlwidgets_1.5.1 httr_1.4.1
## [43] acepack_1.4.1 ellipsis_0.3.1 pkgconfig_2.0.3
## [46] reshape_0.8.8 XML_3.99-0.3 farver_2.0.3
## [49] nnet_7.3-14 dbplyr_1.4.2 locfit_1.5-9.4
## [52] later_1.0.0 tidyselect_1.1.0 labeling_0.3
## [55] rlang_0.4.6 AnnotationDbi_1.46.1 munsell_0.5.0
## [58] cellranger_1.1.0 tools_3.6.3 cli_2.0.2
## [61] generics_0.0.2 RSQLite_2.2.0 broom_0.5.5
## [64] fastmap_1.0.1 evaluate_0.14 yaml_2.2.1
## [67] bit64_0.9-7 fs_1.4.0 nlme_3.1-147
## [70] mime_0.9 ggExtra_0.9 xml2_1.2.5
## [73] compiler_3.6.3 rstudioapi_0.11 png_0.1-7
## [76] ggsignif_0.6.0 reprex_0.3.0 geneplotter_1.62.0
## [79] stringi_1.4.6 highr_0.8 lattice_0.20-41
## [82] Matrix_1.2-18 vctrs_0.3.1 pillar_1.4.4
## [85] lifecycle_0.2.0 data.table_1.12.8 bitops_1.0-6
## [88] httpuv_1.5.2 R6_2.4.1 latticeExtra_0.6-29
## [91] promises_1.1.0 assertthat_0.2.1 withr_2.2.0
## [94] GenomeInfoDbData_1.2.1 mgcv_1.8-31 hms_0.5.3
## [97] grid_3.6.3 rpart_4.1-15 rmarkdown_2.1
## [100] shiny_1.4.0.2 lubridate_1.7.4 base64enc_0.1-3